/*
 * Copyright (c) 2018, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once
#include <stdio.h>
#include <cmath>
#include "cuda_utils.h"
// Col-major 2D -> 1D index: element (i, j) of a matrix with leading dim ld.
// Arguments are parenthesized so expression arguments expand correctly.
#define IDX2C(i, j, ld) ((j) * (ld) + (i))

namespace kf {

using namespace MLCommon;
// helper kernels
// (lkf)
/**
 * @brief kernel writing the dim x dim identity matrix into w
 * @tparam T the data type
 * @param w output matrix (dim x dim, col-major, device memory)
 * @param dim dimension of w
 * @note expects a 2D launch covering at least dim x dim threads.
 * threadIdx.x is mapped to the ROW index: IDX2C is col-major (j*ld + i),
 * so consecutive rows are contiguous and the warp's writes coalesce.
 */
template <typename T>
__global__ void Linear_KF_ID_kernel(T *w, int dim) {
  int i = threadIdx.x + blockDim.x * blockIdx.x;  // row
  int j = threadIdx.y + blockDim.y * blockIdx.y;  // col
  if (i < dim && j < dim) {
    // T-typed literals avoid double constants in a (typically float) kernel
    w[IDX2C(i, j, dim)] = (i == j) ? T(1) : T(0);
  }
}
// (enkf)
/**
 * @brief kernel accumulating scalar * X[:, p] into x for every column p
 * @tparam T the data type
 * @param nPoints number of columns of X
 * @param dim number of rows of X (dimension of x)
 * @param scalar factor applied to each element of X
 * @param X input matrix (col-major, device memory)
 * @param x accumulator vector of length dim (device memory)
 * @note one thread per element of X; expects a 1D launch covering
 * at least dim * nPoints threads.
 */
template <typename T>
__global__ void vctwiseAccumulate_kernel(const int nPoints, const int dim,
                                         const T scalar, const T *X, T *x) {
  const int tid = threadIdx.x + blockDim.x * blockIdx.x;
  if (tid < dim * nPoints) {
    const int component = tid % dim;  // row index within the column
    myAtomicAdd(x + component, scalar * X[tid]);
  }
}

/**
 * @brief kernel dividing each of the n entries of x by divider, in place
 * @tparam T the data type
 * @param divider integer divisor applied to every element
 * @param n number of elements in x
 * @param x vector to normalize (device memory)
 * @note expects a 1D launch covering at least n threads.
 */
template <typename T>
__global__ void En_KF_normalize(const int divider, const int n, T *x) {
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= n) return;
  x[tid] /= divider;
}

/**
 * @brief kernel computing out_m = in_m + scalar * v, with the vector v
 * broadcast across the columns of the col-major row x col matrices
 * @tparam T the data type
 * @param col number of columns
 * @param row number of rows (length of v)
 * @param scalar factor to scale v with
 * @param in_m input matrix (col-major, device memory)
 * @param v vector added to every column (device memory)
 * @param out_m output matrix (col-major, device memory)
 * @note one thread per matrix element; expects a 1D launch covering
 * at least row * col threads.
 */
template <typename T>
__global__ void vctwiseAdd_kernel(const int col, const int row, T scalar,
                                  const T *in_m, const T *v, T *out_m) {
  const int tid = threadIdx.x + blockDim.x * blockIdx.x;
  if (tid >= row * col) return;
  out_m[tid] = in_m[tid] + scalar * v[tid % row];
}

/**
 * @brief identity matrix generating function
 * @tparam T the data type
 * @param I the output matrix (dim x dim, col-major, device memory)
 * @param dim dimension of I
 * @note launches Linear_KF_ID_kernel with a 2D grid of 32x32 blocks;
 * asynchronous with respect to the host (default stream).
 */
template <typename T>
void make_ID_matrix(T *I, int dim) {
  const dim3 threads(32, 32);
  const dim3 blocks(ceildiv(dim, (int)threads.x), ceildiv(dim, (int)threads.y));
  Linear_KF_ID_kernel<T><<<blocks, threads>>>(I, dim);
  CUDA_CHECK(cudaPeekAtLastError());
}

/**
 * @brief scales the column vectors of X and accumulates them into x
 * @tparam T the data type
 * @param nPoints the number of columns in X
 * @param dim the dimension of x (number of rows of X)
 * @param scalar scaling factor applied to each column of X
 *  before it is accumulated into x
 * @param X the input matrix (device memory)
 * @param x the output vector (device memory)
 * @note
 * All the matrices are assumed to be stored in col major form
 * x = x + scalar * (X[0th col] + ... + X[nPoint'th col])
 */
template <typename T>
void vctwiseAccumulate(const int nPoints, const int dim, const T scalar,
                       const T *X, T *x) {
  constexpr int nThreads = 64;
  const int nBlocks = ceildiv(nPoints * dim, nThreads);
  vctwiseAccumulate_kernel<T><<<nBlocks, nThreads>>>(nPoints, dim, scalar, X, x);
  CUDA_CHECK(cudaPeekAtLastError());
}

/**
 * @brief adds a scaled vector to every column of a matrix
 * @tparam T the data type
 * @param col number of columns
 * @param row number of rows
 * @param scalar factor to scale the vector with
 * @param in_m input matrix the scaled vector is added to (device memory)
 * @param v vector to be scaled and added (device memory)
 * @param out_m resulting matrix (device memory)
 * @note
 * All the matrices are assumed to be stored in col major form;
 * v is added column-wise: out_m[:, c] = in_m[:, c] + scalar * v
 */
template <typename T>
void vctwiseAdd(const int col, const int row, const T scalar, const T *in_m,
                const T *v, T *out_m) {
  constexpr int nThreads = 64;
  const int nBlocks = ceildiv(row * col, nThreads);
  vctwiseAdd_kernel<T><<<nBlocks, nThreads>>>(col, row, scalar, in_m, v, out_m);
  CUDA_CHECK(cudaPeekAtLastError());
}

};  // end of namespace kf
